#################################################
##   Load the data                          #####
#################################################

# Switch to the data directory and read the main dataset.
# `wd_data` is not defined in this section; it must exist before this runs.
# The working directory stays changed afterwards.
setwd(wd_data)
data = read.csv(file = 'main_dataset.csv')

# Drop every row whose sruT value is missing.
data = data[which(!is.na(data$sruT)), ]

# Period relative to treatment: policy_period minus sru_period.
# Rows with a missing sru_period are assigned 0.
data$policy_period_ind = with(
  data,
  ifelse(is.na(sru_period), 0, policy_period - sru_period)
)


#################################################
##    Data adjustment      #####
#################################################

# Rescale the vote-share and turnout columns by dividing them by 100
# (presumably percent -> proportion; confirm against the raw data units).
for (share_col in c('left', 'right', 'extreme.left', 'turnout', 'fn')) {
  data[[share_col]] = data[[share_col]] / 100
}
rm(share_col)  # do not leave the loop variable behind in the workspace


# Housing shares, both expressed relative to num_res:
#   res_owned_share - home ownership (num_res_own / num_res)
#   empty_log_share - empty dwellings (num_empty_log / num_res)
data$res_owned_share = with(data, num_res_own / num_res)
data$empty_log_share = with(data, num_empty_log / num_res)

# Create pre-policy shares: take each CODGEO's values of the housing shares
# from the year == 1999 rows and attach them to every row of that CODGEO.
# NOTE: the new columns carry a `_90` suffix although they come from the 1999
# rows; the names are kept because the quantile section refers to them.
df1 = data[which(data$year == 1999), c('CODGEO', 'hlm_share', 'res_owned_share', 'empty_log_share')]
colnames(df1) = c('CODGEO', 'hlm_share_90', 'res_owned_share_90', 'empty_log_share_90')
# merge() has no `on` argument (it is silently absorbed by `...`, leaving the
# join on all shared column names); the key must be passed via `by`.
# all.x = TRUE keeps rows of `data` whose CODGEO has no 1999 observation.
# Note that merge() sorts the result by the key by default.
data = merge(data, df1, by = 'CODGEO', all.x = TRUE)

rm(df1)

# Attach each CODGEO's 2007 values of pop and num_french (as pop_07 and
# num_french_07) to every row of that CODGEO.
df1 = data[which(data$year == 2007), c('CODGEO', 'pop', 'num_french')]
colnames(df1) = c('CODGEO', 'pop_07', 'num_french_07')
data = merge(data, df1, by = 'CODGEO', all.x = TRUE)

rm(df1)


#################################################
##   Subset the sample For Diff in Disc     #####
#################################################

# Include only treated agglomerations: no rural places nor untreated agglomerations
# I.e. all urban agglomerations with at least one treated municipality
df_did = data[which(data$agglo_sru == 1), ]

# Limit to the municipalities that are relatively close to the threshold:
# |running| strictly below the bandwidth `bdw` (defined outside this section).
# which() drops rows with a missing `running` instead of turning them into
# all-NA rows, consistent with the other filters in this section.
df_did = df_did[which(abs(df_did$running) < bdw), ]

# Exclude large cities by name (original note: already done by a population
# threshold; that filter is not visible in this section)
df_did = df_did[!(df_did$libcom %in% c('Paris', 'Lyon', 'Marseille')), ]

# Include only election years/periods, i.e. rows with a non-missing fn value
df_did = df_did[which(!is.na(df_did$fn)), ]



#################################################
##   quantiles                              #####
#################################################

# # Adjust variables

# Bin a numeric vector into terciles.
#
# Args:
#   x: numeric vector (may contain NA).
# Returns:
#   A factor of the same length as x with levels "1", "2", "3" (lowest to
#   highest tercile); NA where x is NA.
#
# Breaks are the empirical 0, 1/3, 2/3 and 1 quantiles of x (NAs ignored), so
# the top break is the maximum and every non-missing value falls into a bin.
# The previous probs = seq(0, 1, by = 0.33333) stopped at 0.99999, which put
# the top break below the maximum and could turn the largest values into NA.
# As before, cut() raises an error if the quantile breaks are not unique.
tercile_cut = function(x) {
  tercile_breaks = quantile(x, probs = (0:3) / 3, na.rm = TRUE)
  cut(x, breaks = tercile_breaks, labels = c("1", "2", "3"), include.lowest = TRUE)
}

# immigration 1990 quantiles
df_did$imm_quant_90 = tercile_cut(df_did$imm_share_90)

# immigration 1999 quantiles
df_did$imm_quant_99 = tercile_cut(df_did$imm_share_99)

# immigration change quantiles (change in immigration share, 1990 to 1999)
df_did$delta_imm = df_did$imm_share_99 - df_did$imm_share_90
df_did$delta_imm_quant = tercile_cut(df_did$delta_imm)


# hlm housing share quantiles (column hlm_share_90)
df_did$hlm_quant = tercile_cut(df_did$hlm_share_90)

# home ownership share quantiles (column res_owned_share_90)
df_did$res_quant = tercile_cut(df_did$res_owned_share_90)

# median income 2001 quantiles
df_did$inc_quant = tercile_cut(df_did$median_inc_01)


# fn_1995 quantiles
df_did$fn_quant = tercile_cut(df_did$fn_1995)


# empty dwellings share quantiles (column empty_log_share_90)
df_did$empty_quant = tercile_cut(df_did$empty_log_share_90)
